Analyze selection data using soluble Ephrin-B2 or -B3¶
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
#input configs
altair_config = None
nipah_config = None
#input files
func_scores_E2_file = None
binding_E2_file = None
func_scores_E3_file = None
binding_E3_file = None
#output files
filtered_E2_binding_data = None
filtered_E3_binding_data = None
#output images
entry_binding_combined_corr_plot = None
entry_binding_combined_corr_plot_agg = None
E2_binding_heatmap = None
E3_binding_heatmap = None
E2_E3_correlation = None
E2_E3_correlation_site = None
combined_E2_E3_site_corr = None
binding_by_site_plot = None
entropy_file = None
In [2]:
# Parameters
func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
binding_E2_file = "results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
binding_E3_file = "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"
filtered_E2_binding_data = "results/filtered_data/E2_binding_filtered.csv"
filtered_E3_binding_data = "results/filtered_data/E3_binding_filtered.csv"
entry_binding_combined_corr_plot = (
"results/images/entry_binding_combined_corr_plot.html"
)
entry_binding_combined_corr_plot_agg = (
"results/images/entry_binding_combined_corr_plot_agg.html"
)
E2_binding_heatmap = "results/images/E2_binding_heatmap.html"
E3_binding_heatmap = "results/images/E3_binding_heatmap.html"
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
E2_E3_correlation = "results/images/E2_E3_correlation.html"
E2_E3_correlation_site = "results/images/E2_E3_correlation_site.html"
combined_E2_E3_site_corr = "results/images/combined_E2_E3_site_corr.html"
entropy_file = "results/entropy/entropy.csv"
binding_by_site_plot = "results/images/binding_by_site_plot.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
pass
print("Already in correct directory")
else:
os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
print("Setup in correct directory")
Setup in correct directory
In [5]:
##hard paths in case don't want to run with snakemake
#
#altair_config = "data/custom_analyses_data/theme.py"
#nipah_config = "nipah_config.yaml"
#
##input files
#func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
#binding_E2_file = "results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
#func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
#binding_E3_file = "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"
#
##output files
#filtered_E2_binding_data = "results/filtered_data/E2_binding_filtered.csv"
#filtered_E3_binding_data = "results/filtered_data/E3_binding_filtered.csv"
#
###output images
#E2_binding_entry = "results/images/E2_binding_entry.html"
#E3_binding_entry = "results/images/E3_binding_entry.html"
#E2_binding_heatmap = None
#E3_binding_heatmap = None
#E2_E3_correlation = None
#E2_E3_correlation_site = None
#
#entropy_file = "results/entropy/entropy.csv"
Run config files to setup altair theme and config variables¶
In [6]:
if altair_config:
with open(altair_config, 'r') as file:
exec(file.read())
with open(nipah_config) as f:
config = yaml.safe_load(f)
Make the E2/E3 dataframes, filter separately, then merge¶
In [7]:
e2 = pd.read_csv(binding_E2_file)
e2_func = pd.read_csv(func_scores_E2_file)
e3 = pd.read_csv(binding_E3_file)
e3_func = pd.read_csv(func_scores_E3_file)
def merge_func_binding_dfs(func,binding,name):
df_int = pd.merge(
binding,
func,
on=['site','mutant','wildtype'],
suffixes=['_affinity','_cell_entry'],
validate='one_to_one',
how='outer'
)
df = df_int.rename(columns={'Ephrin binding_mean':'binding_mean','Ephrin binding_std':'binding_std','Ephrin binding_median':'binding_median'})
#only filter func effects
df_pre_filter = df[
(df['mutant'] != '*') &
(df['mutant'] != '-') &
(df['site'] != 603) &
(df['effect'] >= config['min_func_effect_for_binding']) &
(df['times_seen_cell_entry'] >= config['func_times_seen_cutoff']) &
(df['effect_std'] <= config['func_std_cutoff'])
]
#Now filter binding
df_post_filter = df_pre_filter[
(df_pre_filter['times_seen_affinity'] >= config['min_times_seen_binding']) &
(df_pre_filter['binding_std'] <= config['max_binding_std']) &
(df_pre_filter['frac_models'] >= config['frac_models'])
]
def plot_affinity_corr(df):
if name == 'E2':
color = '#1f4e79'
else:
color = '#ff7f0e'
chart = alt.Chart(df).mark_point(size=30, color='black', opacity=0.2, filled=True).encode(
x=alt.X('effect', title=f'{name} Cell Entry', axis=alt.Axis(grid=True)),
y=alt.Y('binding_median', title=f'{name} Binding', axis=alt.Axis(grid=True)),
tooltip=['site', 'wildtype', 'mutant'],
).properties(
width=alt.Step(10),
height=alt.Step(10),
)
return chart.display()
def plot_times_seen_std(df):
if name == 'E2':
color = '#1f4e79'
else:
color = '#ff7f0e'
chart = alt.Chart(df).mark_circle(size=20,color='black',opacity=0.2).encode(
x=alt.X('times_seen_affinity',axis=alt.Axis(grid=True),title=f'Times Seen for {name}',scale=alt.Scale(type='log')),
y=alt.Y('binding_std',title='Binding Std',axis=alt.Axis(grid=True)),
tooltip=['site','times_seen_affinity','effect_std','effect']
).properties(
height=alt.Step(10),
width=alt.Step(10)
)
return chart.display()
plot_affinity_corr(df)
entry_vs_binding = plot_affinity_corr(df_post_filter)
#entry_vs_binding.display()
#entry_vs_binding.save(entry_vs_binding)
plot_times_seen_std(df)
plot_times_seen_std(df_post_filter)
#For pulling out low effect mutants for heatmaps later
def store_filtered_info(df):
df_filter = df[
(df['mutant'] != '*') &
(df['mutant'] != '-') &
(df['site'] != 603) &
(df['effect'] < config['min_func_effect_for_binding']) &
(df['times_seen_cell_entry'] >= config['func_times_seen_cutoff']) &
(df['effect_std'] <= config['func_std_cutoff'])
]
return df_filter
df_low_effect_filter = store_filtered_info(df)
return df_post_filter,df_low_effect_filter
df_E2_filter,df_E2_filter_missing = merge_func_binding_dfs(e2_func,e2,'EFNB2')
df_E3_filter,df_E3_filter_missing = merge_func_binding_dfs(e3_func,e3,'EFNB3')
def plot_corr_binding_entry(df,name):
chart = alt.Chart(df).mark_point(size=30, color='black', opacity=0.2, filled=True).encode(
x=alt.X('effect', title=f'{name} Cell Entry', axis=alt.Axis(grid=True)),
y=alt.Y('binding_median', title=f'{name} Binding', axis=alt.Axis(grid=True)),
tooltip=['site', 'wildtype', 'mutant','binding_median','times_seen_affinity','effect'],
).properties(
width=alt.Step(10),
height=alt.Step(10),
)
return chart
#E2_binding_entry_tmp = plot_corr_binding_entry(df_E2_filter,'EFNB2')
#E2_binding_entry_tmp.display()
#E2_binding_entry_tmp.save(E2_binding_entry)
#E3_binding_entry_tmp = plot_corr_binding_entry(df_E3_filter,'EFNB3')
#E3_binding_entry_tmp.display()
#E3_binding_entry_tmp.save(E3_binding_entry)
#Save filtered dataframes for crystal structure mapping
df_E2_filter.to_csv(filtered_E2_binding_data,index=False)
df_E3_filter.to_csv(filtered_E3_binding_data,index=False)
#Now that they are filtered, merge EFNB2 and EFNB3
df_affinity_filter_merge = pd.merge(
df_E2_filter,
df_E3_filter,
on=['site','wildtype','mutant'],
suffixes=['_E2','_E3'],
how='outer'
)
#Add columns that calculate difference between EFNB2 and EFNB3 cell entry and EFNB2 and EFNB3 binding.
df_affinity_filter_merge['func_effect_diff'] = (df_affinity_filter_merge['effect_E2'] - df_affinity_filter_merge['effect_E3']).abs()
df_affinity_filter_merge['binding_effect_diff'] = (df_affinity_filter_merge['binding_mean_E2'] - df_affinity_filter_merge['binding_mean_E3']).abs()
#display stats
display(df_affinity_filter_merge[['binding_std_E2','binding_std_E3','binding_median_E2','binding_median_E3']].describe())
| binding_std_E2 | binding_std_E3 | binding_median_E2 | binding_median_E3 | |
|---|---|---|---|---|
| count | 6657.000000 | 6563.000000 | 6657.000000 | 6563.000000 |
| mean | 0.497838 | 0.179371 | -0.325661 | -0.017590 |
| std | 0.317808 | 0.172135 | 1.079307 | 0.270343 |
| min | 0.004012 | 0.000018 | -5.284000 | -1.960000 |
| 25% | 0.271600 | 0.060175 | -0.406000 | -0.150050 |
| 50% | 0.424900 | 0.134200 | -0.025360 | -0.014280 |
| 75% | 0.644100 | 0.244750 | 0.189900 | 0.116300 |
| max | 1.989000 | 1.795000 | 2.335000 | 2.008000 |
Make nice interactive plot for correlation between binding and entry for EFNB2 and EFNB3¶
In [8]:
#process dataframes for plotting
def fix_dfs_for_plotting(df,selection):
tmp_df = df[['site','wildtype','mutant','binding_median','binding_std','times_seen_affinity','effect','effect_std','times_seen_cell_entry']]
tmp_df['cell_type'] = selection
return tmp_df
new_e2 = fix_dfs_for_plotting(df_E2_filter,'EFNB2')
new_e3 = fix_dfs_for_plotting(df_E3_filter,'EFNB3')
e2_e3_merge = pd.concat([new_e2,new_e3])
#display(e2_e3_merge)
def plot_corr_binding_entry_updated(df,flag):
variant_selector = alt.selection_point(
on="mouseover",
empty=False,
fields=["site","mutant"],
value=0
)
variant_selector_agg = alt.selection_point(
on="mouseover",
empty=False,
fields=["site"],
value=0
)
slider = alt.binding_range(min=2, max=10, step=1, name="times seen")
selector = alt.param(name="SelectorName", value=2, bind=slider)
empty_chart = []
for cell in list(df['cell_type'].unique()):
tmp_df = df[df['cell_type'] == cell]
if flag == True:
agg_df = tmp_df.groupby('site')[['binding_median','effect']].median().reset_index()
#display(agg_df)
chart = alt.Chart(agg_df).mark_point(stroke='black',filled=True).encode(
x=alt.X('effect', title=f'Median {cell} Cell Entry', axis=alt.Axis(grid=True)),
y=alt.Y('binding_median', title=f'Median {cell} Binding', axis=alt.Axis(grid=True)),
opacity=alt.condition(variant_selector_agg, alt.value(1), alt.value(0.2)),
size=alt.condition(variant_selector_agg,alt.value(100),alt.value(50)),
strokeWidth=alt.condition(variant_selector_agg,alt.value(1),alt.value(0)),
color=alt.condition(variant_selector_agg,alt.value('orange'),alt.value('black')),
tooltip=['site', 'binding_median','effect'],
).properties(
width=200,
height=200,
).add_params(variant_selector_agg)
empty_chart.append(chart)
else:
chart = alt.Chart(tmp_df).mark_point(stroke='black',filled=True).encode(
x=alt.X('effect', title=f'{cell} Cell Entry', axis=alt.Axis(grid=True)),
y=alt.Y('binding_median', title=f'{cell} Binding', axis=alt.Axis(grid=True)),
opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.1)),
size=alt.condition(variant_selector,alt.value(50),alt.value(20)),
strokeWidth=alt.condition(variant_selector,alt.value(1),alt.value(0)),
color=alt.condition(variant_selector,alt.value('orange'),alt.value('black')),
tooltip=['site', 'wildtype', 'mutant','binding_median','times_seen_affinity','effect'],
).properties(
width=200,
height=200,
).add_params(variant_selector,selector).transform_filter(
alt.datum.times_seen_affinity >= selector
)
empty_chart.append(chart)
combined_chart = alt.hconcat(*empty_chart,title=alt.Title('Correlation between binding and entry'))
return combined_chart
entry_binding_corr_plot = plot_corr_binding_entry_updated(e2_e3_merge,False)
entry_binding_corr_plot.display()
entry_binding_corr_plot.save(entry_binding_combined_corr_plot)
entry_binding_corr_plot_agg = plot_corr_binding_entry_updated(e2_e3_merge,True)
entry_binding_corr_plot_agg.display()
entry_binding_corr_plot_agg.save(entry_binding_combined_corr_plot_agg)
/loc/scratch/38046115/ipykernel_24037/3545742511.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy tmp_df['cell_type'] = selection /loc/scratch/38046115/ipykernel_24037/3545742511.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy tmp_df['cell_type'] = selection
In [9]:
def overall_stats(df,effect,name):
#Find quantiles
quantile_2 = df['binding_median'].quantile(.02)
quantile_98 = df['binding_median'].quantile(.98)
print(f'The 2% quantile for {name} is: {quantile_2}')
print(f'The 98% quantile for {name} is: {quantile_98}')
#Now group sites and find intolerant sites
filtered_df = df.groupby('site').filter(lambda group: (group[effect] <-0.25).all())
unique = filtered_df['site'].unique()
# Convert unique to a Pandas Series
unique_series = pd.Series(unique)
#print(unique_series)
# Find the common elements
unique_contact_bool = unique_series.isin(config['contact_sites'])
#print(unique_contact_bool)
# Filter and get the common elements
common_elements = unique_series[unique_contact_bool]
# Print the common elements
print(f'Here are the contact sites that are conserved: {common_elements}')
print(f'There are {len(unique)} sites with all negative binding score mutants for {name}')
print(f'These are the sites for {name} with all negative binding score mutants: {list(unique)}')
#Now find sites with low and high binding (median)
median_df = df.groupby('site')['binding_median'].median().reset_index().sort_values(by='binding_median')
print(f'For {name}, these are the sites with lowest median binding scores: {median_df.head(5)}')
median_df = df.groupby('site')['binding_median'].median().reset_index().sort_values(by='binding_median',ascending=False)
print(f'For {name}, these are the sites with highest median binding scores: {median_df.head(5)}')
#Now calculate mutant number
total_mutants = df.shape[0]
upper_cutoff = df[df[effect] > 1].sort_values(by='binding_median',ascending=False)
median_upper = upper_cutoff['effect'].median()
print(f'The median entry score for top binders was: {median_upper}')
mutants_above_cutoff_tolerated = upper_cutoff[upper_cutoff['effect'] > 0]
mutants_above_cutoff_tolerated = mutants_above_cutoff_tolerated[['site','effect','binding_median','wildtype','mutant']]
print(f'The mutants with positive entry scores and good binding are: {mutants_above_cutoff_tolerated.head(5)}')
lower_cutoff = df[df[effect] < -1]
print(f'For {name}, there are a total of : {total_mutants} binding mutants')
print(f'For {name}, there are {upper_cutoff.shape[0]} mutants above cutoff, and {mutants_above_cutoff_tolerated.shape[0]} that have good entry scores')
print(f'For {name}, there are {lower_cutoff.shape[0]} mutants below cutoff')
total_sites = df['site'].unique().shape[0]
print(f'The total number of sites are: {total_sites}')
overall_stats(df_E2_filter,'binding_median','E2')
overall_stats(df_E3_filter,'binding_median','E3')
The 2% quantile for E2 is: -3.8724000000000003 The 98% quantile for E2 is: 1.112 Here are the contact sites that are conserved: 5 242 11 389 23 488 24 490 25 491 29 501 30 504 31 505 33 531 34 532 35 533 36 557 37 579 38 581 41 588 dtype: int64 There are 44 sites with all negative binding score mutants for E2 These are the sites for E2 with all negative binding score mutants: [116, 206, 220, 236, 238, 242, 243, 248, 346, 351, 352, 389, 390, 398, 399, 400, 438, 439, 441, 460, 467, 486, 487, 488, 490, 491, 494, 495, 497, 501, 504, 505, 526, 531, 532, 533, 557, 579, 581, 584, 585, 588, 590, 594] For E2, these are the sites with lowest median binding scores: site binding_median 443 533 -4.045 407 494 -4.025 404 490 -3.941 402 487 -3.868 414 504 -3.820 For E2, these are the sites with highest median binding scores: site binding_median 133 208 1.38740 48 120 1.36300 59 132 1.24100 132 207 1.20775 250 331 1.12100 The median entry score for top binders was: -0.7596 The mutants with positive entry scores and good binding are: site effect binding_median wildtype mutant 1324 139 0.00505 1.989 N Y 5448 354 0.22810 1.498 S T 9533 566 0.08127 1.415 F H 7170 444 0.16280 1.256 I F 1217 134 0.09948 1.249 S I For E2, there are a total of : 6657 binding mutants For E2, there are 178 mutants above cutoff, and 22 that have good entry scores For E2, there are 970 mutants below cutoff The total number of sites are: 511 The 2% quantile for E3 is: -0.634456 The 98% quantile for E3 is: 0.601376 Here are the contact sites that are conserved: 2 389 4 488 8 501 9 531 10 532 dtype: int64 There are 11 sites with all negative binding score mutants for E3 These are the sites for E3 with all negative binding score mutants: [108, 352, 389, 486, 488, 494, 495, 497, 501, 531, 532] For E3, these are the sites with lowest median binding scores: site binding_median 414 501 -0.91625 440 531 -0.72305 308 389 -0.68695 463 555 -0.65450 405 488 -0.64390 For E3, these are the sites with highest median binding scores: site binding_median 493 589 0.67100 85 159 0.56090 56 129 0.47550 59 132 0.46735 87 161 0.44775 The median entry score for top binders was: -0.7568 The mutants with positive entry scores and good binding are: site effect binding_median wildtype mutant 7995 492 0.49590 1.200 Q L 2632 211 0.03675 1.149 G F 10015 598 0.34370 1.141 P G 1655 161 0.23430 1.011 S E For E3, there are a total of : 6563 binding mutants For E3, there are 25 mutants above cutoff, and 4 that have good entry scores For E3, there are 11 mutants below cutoff The total number of sites are: 507
Find sites with opposite effects on binding¶
In [10]:
#find sites that are different
def find_biggest_differences(df):
df = df[['site','wildtype','mutant','binding_median_E2',
'LibA-231112-EFNB2-monomeric','LibA-231222-BA6-CHO-EFNB2-monomeric','LibB-231108-EFNB2-monomeric', 'LibB-231222-BA6-CHO-EFNB2-monomeric',
'effect_E2',
'binding_median_E3','LibA-230825-EFNB3-dimeric','LibB-230907-EFNB3-dimeric','effect_E3','effect_std_E3','times_seen_cell_entry_E3','binding_effect_diff']]
#,'binding_effect_diff']]
df = df[df['site'].isin(config['contact_sites'])]
efnb2_good_efnb3_bad = df[
(df['binding_median_E2'] > 0.1) &
(df['binding_median_E3'] < -0.1)
].sort_values(by='binding_median_E2',ascending=False)
display(efnb2_good_efnb3_bad)
efnb2_bad_efnb3_good = df[
(df['binding_median_E2'] < -0.1) &
(df['binding_median_E3'] > 0.1)
].sort_values(by='binding_median_E3',ascending=False)
display(efnb2_bad_efnb3_good)
find_biggest_differences(df_affinity_filter_merge)
| site | wildtype | mutant | binding_median_E2 | LibA-231112-EFNB2-monomeric | LibA-231222-BA6-CHO-EFNB2-monomeric | LibB-231108-EFNB2-monomeric | LibB-231222-BA6-CHO-EFNB2-monomeric | effect_E2 | binding_median_E3 | LibA-230825-EFNB3-dimeric | LibB-230907-EFNB3-dimeric | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | binding_effect_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1837 | 241 | S | L | 0.8667 | 1.0410 | 0.61050 | 0.6921 | 1.2300 | -0.4249 | -0.1359 | -0.1972 | -0.07473 | 0.15840 | 0.2774 | 6.571 | 1.02940 |
| 1832 | 241 | S | F | 0.6864 | 0.7802 | 0.58160 | 0.5925 | 1.7230 | -0.3853 | -0.1101 | -0.4214 | 0.20110 | -0.22970 | 0.2284 | 6.857 | 1.02940 |
| 1833 | 241 | S | G | 0.1692 | 0.2446 | 0.09392 | 0.3091 | -0.5815 | -0.3386 | -0.2740 | -0.1346 | -0.41340 | -0.09642 | 0.2263 | 5.857 | 0.29052 |
| site | wildtype | mutant | binding_median_E2 | LibA-231112-EFNB2-monomeric | LibA-231222-BA6-CHO-EFNB2-monomeric | LibB-231108-EFNB2-monomeric | LibB-231222-BA6-CHO-EFNB2-monomeric | effect_E2 | binding_median_E3 | LibA-230825-EFNB3-dimeric | LibB-230907-EFNB3-dimeric | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | binding_effect_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5266 | 492 | Q | L | -0.1671 | -0.06133 | -0.27290 | -0.73670 | 0.52310 | 0.51440 | 1.2000 | 1.22200 | 1.17800 | 0.4959 | 0.2963 | 4.857 | 1.3370 |
| 6489 | 588 | I | P | -2.2390 | -3.55400 | -1.65200 | -2.26100 | -2.21700 | -0.18690 | 1.0520 | 1.01300 | 1.09000 | -0.6374 | 0.3687 | 5.000 | 3.4730 |
| 5269 | 492 | Q | R | -2.9170 | -3.51500 | -2.60900 | -2.15100 | -3.22500 | 0.01257 | 0.6505 | 0.70400 | 0.59710 | 0.4092 | 0.2253 | 19.290 | 3.5255 |
| 5243 | 491 | S | A | -0.9052 | -1.08200 | -0.72840 | -0.40320 | -1.52200 | -0.59270 | 0.6041 | 0.57220 | 0.63610 | 0.2335 | 0.5233 | 6.143 | 1.5381 |
| 4014 | 402 | R | M | -0.4290 | -0.37740 | -0.15310 | -0.48070 | -0.82860 | -0.23740 | 0.3071 | 0.27330 | 0.34090 | -0.1438 | 0.2759 | 5.857 | 0.7670 |
| 6350 | 580 | I | E | -0.4390 | 0.88280 | -0.45010 | -0.42780 | -0.89910 | -0.57650 | 0.2554 | 0.15280 | 0.35790 | 0.5231 | 0.5856 | 5.143 | 0.4790 |
| 1816 | 239 | S | G | -0.1330 | -0.25570 | -0.01024 | -1.12400 | 0.90290 | 0.14500 | 0.2176 | 0.09886 | 0.33640 | 0.3443 | 0.3839 | 2.000 | 0.3395 |
| 6105 | 559 | Q | L | -0.2814 | -1.42100 | -0.18540 | -0.37740 | -0.06623 | -0.77730 | 0.1812 | 0.17160 | 0.19090 | -0.5984 | 0.7335 | 4.571 | 0.6936 |
| 5235 | 490 | Q | L | -1.4730 | -1.50600 | -1.44000 | -1.57700 | -0.91850 | 0.41890 | 0.1462 | 0.33720 | -0.04486 | 0.5374 | 0.2450 | 5.857 | 1.5062 |
| 3834 | 388 | Q | Y | -0.1516 | -1.03200 | 0.04747 | 0.37960 | -0.35070 | 0.50160 | 0.1412 | 0.14140 | 0.14100 | 0.5493 | 0.0976 | 5.143 | 0.3800 |
| 3833 | 388 | Q | W | -0.2135 | -0.46200 | -0.06387 | -0.01505 | -0.36320 | 0.38560 | 0.1166 | 0.06678 | 0.16650 | -0.2053 | 0.2456 | 6.429 | 0.3426 |
| 1863 | 242 | R | W | -1.2840 | -1.73800 | -0.90460 | -0.37760 | -1.66300 | 0.09253 | 0.1141 | -0.01441 | 0.24250 | -0.4407 | 0.4435 | 6.286 | 1.2851 |
| 5419 | 507 | V | M | -1.3570 | -0.19970 | -0.99620 | -1.89000 | -1.71900 | 0.41880 | 0.1024 | 0.53630 | -0.33150 | -0.4414 | 0.5062 | 5.143 | 1.3034 |
Find correlations between EFNB2 and EFNB3 binding¶
In [11]:
def plot_affinity_solid(df):
slider = alt.binding_range(min=1, max=20, step=1, name="times_seen")
selector = alt.param(name="SelectorName", value=1, bind=slider)
df = df.dropna()
# calculate r value
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df['binding_median_E2'], df['binding_median_E3'])
r_value = float(r_value)
# make chart
chart = alt.Chart(df,title=alt.Title('Correlation between Mutant Binding Scores',subtitle=f'r={r_value:.2f}')).mark_point(color='black',size=30, opacity=0.2,filled=True).encode(
x=alt.X('binding_median_E2', title=('Ephrin-B2 Binding')),
y=alt.Y('binding_median_E3', title=('Ephrin-B3 Binding')),
tooltip=['site', 'wildtype','mutant','binding_median_E2','binding_median_E3','effect_E2','effect_E3'],
).properties(
width=300,
height=300
).add_params(selector).transform_filter(
(alt.datum.times_seen_affinity_E2 >= selector)
)
min = int(df['binding_median_E2'].min())
max = int(df['binding_median_E3'].max())
text = alt.Chart({'values':[{'x': min, 'y': max, 'text': f'r = {r_value:.2f}'}]}).mark_text(
align='left', baseline='top', dx=-10, dy=-20).encode(
x=alt.X('x:Q'),
y=alt.Y('y:Q'),
text='text:N'
)
chart_and_text = chart #+ text
return chart_and_text#.display()
E2_E3_corr = plot_affinity_solid(df_affinity_filter_merge)
E2_E3_corr.display()
E2_E3_corr.save(E2_E3_correlation)
Plot correlations between summary statistics for each site¶
In [12]:
def plot_affinity_solid_mean(df):
df = df.dropna()
means = df.groupby('site').agg({
'effect_E2': 'median',
'effect_E3': 'median',
'binding_median_E2': 'median',
'binding_median_E3': 'median',
'wildtype': 'first'
}).reset_index()
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(means['binding_median_E2'], means['binding_median_E3'])
r_value = float(r_value)
chart = alt.Chart(means,title=alt.Title('Correlation between Aggregate Mutant Binding Scores',subtitle=f'r={r_value:.2f}')).mark_point(size=50, color='black', opacity=0.3,filled=True).encode(
x=alt.X('binding_median_E2', title=('Median Ephrin-B2 Binding'), axis=alt.Axis(tickCount=3)),
y=alt.Y('binding_median_E3', title=('Median Ephrin-B3 Binding'), axis=alt.Axis(tickCount=3)),
tooltip=['site', 'wildtype','binding_median_E2','binding_median_E3','effect_E2','effect_E3'],
).properties(
width=300,
height=300
)
text = alt.Chart({'values':[{'x': -3.5, 'y': 0.5, 'text': f'r = {r_value:.2f}'}]}).mark_text(
align='left', baseline='top', dx=0, dy=-10).encode(
x=alt.X('x:Q'),
y=alt.Y('y:Q'),
text='text:N'
)
chart_and_text = chart #+ text
return chart_and_text#.display()
E2_E3_site_corr = plot_affinity_solid_mean(df_affinity_filter_merge)
E2_E3_site_corr.display()
E2_E3_site_corr.save(E2_E3_correlation_site)
(E2_E3_corr | E2_E3_site_corr).save(combined_E2_E3_site_corr)
Make plot showing binding by site (median)¶
In [13]:
def plot_affinity_by_site_median(df):
variant_selector = alt.selection_point(
on="mouseover",
empty=False,
fields=["site"],
value=0
)
empty_charts = []
for selection in ['binding_median_E2','binding_median_E3']:
if selection == 'binding_median_E2':
name = 'Ephrin-B2 Binding'
else:
name = 'Ephrin-B3 Binding'
mean = df.groupby('site')[selection].sum().reset_index()
chart = alt.Chart(mean).mark_point(size=60, color='black', stroke='black',filled=True).encode(
x=alt.X('site', title=('Site'), axis=alt.Axis(grid=True, tickCount=4),scale=alt.Scale(domain=[70,602])),
y=alt.Y(selection, title=(name), axis=alt.Axis(grid=True, tickCount=4)),
tooltip=['site'],
color=alt.condition(variant_selector, alt.value('orange'), alt.value('black')),
opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.4)),
strokeWidth=alt.condition(variant_selector,alt.value(2),alt.value(0))
).properties(
width=700,
height=200
).add_params(variant_selector)
empty_charts.append(chart)
combined_chart = alt.vconcat(*empty_charts, spacing=1,title='Summed Binding by Site')
return combined_chart
binding_by_site = plot_affinity_by_site_median(df_affinity_filter_merge)
binding_by_site.display()
binding_by_site.save(binding_by_site_plot)
Make bubble plots for binding in different areas of receptor pocket¶
In [14]:
def make_boxplot_binding_region(df,title):# Create a box plot using Altair for aggregated means
barrel_ranges = {
'Hydrophobic': config['hydrophobic'],
'Salt Bridges': config['salt_bridges'],
'Hydrogen Bonds': config['h_bond_total'],
'Contact': config['contact_sites'],
'Overall': list(range(71,602)),
}
mean_df = df.groupby('site')[['binding_median']].median().reset_index()
custom_order = ['Hydrophobic','Salt Bridges','Hydrogen Bonds','Contact','Overall']
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = mean_df[mean_df['site'].isin(sites)]
for _, row in subset.iterrows():
agg_means.append({'barrel': barrel, 'effect': row['binding_median'],'site':row['site']})
agg_means_df = pd.DataFrame(agg_means)
chart = alt.Chart(agg_means_df).mark_point(filled=True,size=70,opacity=0.4,color='black').encode(
x=alt.X('barrel:O', sort=custom_order,title=None,axis=alt.Axis(labelAngle=-90)),
y=alt.Y('effect',title=f'Median {title} Binding',axis=alt.Axis(grid=True,tickCount=4)),
xOffset='random:Q',
color = alt.Color('barrel').legend(None),
tooltip=['barrel', 'effect','site'],
).transform_calculate(
random="sqrt(-1*log(random()))*cos(2*PI*random())"
).properties(
height=alt.Step(20),
width=alt.Step(25)
)
return chart.display()
make_boxplot_binding_region(df_E2_filter,'EFNB2')
make_boxplot_binding_region(df_E3_filter,'EFNB3')
Make lineplot by site showing mean binding affinity¶
Plot histogram¶
In [15]:
def effect_histogram(df):
colors = {'E2': '#1f4e79', 'E3': '#ff7f0e'}
all_charts = []
for effect in ['E2','E3']:
func_effect_container = []
for func_effect in [-2]:# Melt the dataframe for the specific columns
df = df[
(df[f'effect_{effect}'] > func_effect)
]
color = colors[effect]
df_melted = df.melt(value_vars=['binding_median_E2', 'binding_median_E3'], var_name='Effect', value_name='Value')
# Histogram for 'effect_E2'
histogram = alt.Chart(df_melted[df_melted['Effect'] == f'binding_median_{effect}']).mark_bar(opacity=1,color=color).encode(
x=alt.X('Value', bin=alt.Bin(step=0.1), title=f'Binding {effect}',axis=alt.Axis(tickCount=4,values=[3,1,0,-1,-5]),scale=alt.Scale(domain=[-6,3])),
y=alt.Y('count()',stack=None,title='Count'),
#color=alt.Color('red'),
tooltip=['Effect', 'count()']
).properties(
width=150,
height=alt.Step(10)
)
func_effect_container.append(histogram)
combined_effect_chart = alt.hconcat(*func_effect_container).resolve_scale(y='shared', x='shared', color='independent')
all_charts.append(combined_effect_chart)
final_combined_chart = alt.vconcat(*all_charts).resolve_scale(y='independent', x='independent', color='independent')
return final_combined_chart.display()
effect_histogram(df_affinity_filter_merge)
#effect_histogram(df_affinity_filter_merge,'#ff7f0e','binding_mean_E3')
Make binding score heatmaps¶
First, prep data for heatmap¶
In [16]:
entropy_df = pd.read_csv(entropy_file)
entropy_df = entropy_df[['site','henipavirus_entropy']]
entropy_df = entropy_df.dropna(subset=['site'])
entropy_df['site'] = entropy_df['site'].astype('Int64')
entropy_df = entropy_df.rename(columns={'henipavirus_entropy':'entropy'})
entropy_df = entropy_df[['site','entropy']].drop_duplicates()
entropy_df['mutant'] = 'entropy'
entropy_df['wildtype'] = ''
entropy_df['effect'] = 'effect'
entropy_df.rename(columns={'entropy': 'value'}, inplace=True)
display(entropy_df.head(3))
def make_contact():
df = pd.DataFrame({
'site': config['contact_sites'],
'contact': [0.0] * len(config['contact_sites'])
})
df = df[['site','contact']]
df['mutant'] = 'contact'
df['wildtype'] = ''
df['effect'] = 'binding_median'
df.rename(columns={'contact':'value'}, inplace=True)
return df
contact_df = make_contact()
display(contact_df.head(3))
def make_empty_df_with_filtered(df,filter_df):
sites = range(71, 603)
amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
# Create the combination of each site with each amino acid
data = [{'site': site, 'mutant': aa} for site in sites for aa in amino_acids]
# Create the DataFrame
empty_df = pd.DataFrame(data)
all_sites_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
df_melted = all_sites_df.melt(id_vars=['site', 'mutant', 'wildtype'],
value_vars=['binding_median'],
var_name='effect', value_name='value')
df_filtered = filter_df.melt(id_vars=['site', 'mutant', 'wildtype'],
value_vars=['effect'],
var_name='effect', value_name='value')
df_test = pd.concat([df_melted,df_filtered,contact_df],ignore_index=True)
return df_test
empty_df_E2 = make_empty_df_with_filtered(df_E2_filter,df_E2_filter_missing)
empty_df_E3 = make_empty_df_with_filtered(df_E3_filter,df_E3_filter_missing)
| site | value | mutant | wildtype | effect | |
|---|---|---|---|---|---|
| 0 | 1 | -0.00 | entropy | effect | |
| 1 | 2 | 1.75 | entropy | effect | |
| 2 | 3 | 1.75 | entropy | effect |
| site | value | mutant | wildtype | effect | |
|---|---|---|---|---|---|
| 0 | 239 | 0.0 | contact | binding_median | |
| 1 | 240 | 0.0 | contact | binding_median | |
| 2 | 241 | 0.0 | contact | binding_median |
In [17]:
def plot_heatmap_full(df,legend_title):
# Create y-axis order list
custom_order = ['contact','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C'] #custom_order = names + custom_order
final_df = df
full_ranges = [list(range(start, end)) for start, end in [(71, 204), (204, 337), (337, 470), (470, 603)]]
# Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
final_df = final_df.sort_values('site')
sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
# Now sort the dataframe by this rank
final_df = final_df.sort_values('mutant_rank')
# Drop the 'mutant_rank' column as it is no longer needed after sorting
final_df = final_df.drop(columns=['mutant_rank'])
sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
if legend_title == 'Ephrin-B2 Binding':
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0,domain=[-6,2.5]) #EFNB3 is -1.5,2, EFNB2 is -6,3
else:
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0,domain=[-2,2]) #EFNB3 is -1.5,2, EFNB2 is -6,3
color_scale_entropy = alt.Scale(scheme='teals', domain=[0, 1],reverse=True)
color_scale_contact = alt.Scale(scheme='purples', domainMid=0, domain=[0, 5],reverse=True)
color_scale_features = alt.Scale(scheme='greys', domainMid=0, domain=[0, 5],reverse=True)
strokewidth_size = 0.25
effect_legend_added = False
charts = [] # container to hold the charts
for idx, subset in enumerate(full_ranges):
subset_df = final_df[final_df['site'].isin(subset)] #for the wrapping of sites
unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype']) #for the wildtype mapping
is_last_plot = idx == len(full_ranges) - 1
x_axis = alt.Axis(labelAngle=-90,
labelExpr="datum.value % 10 === 0 ? datum.value : ''",
title="Site" if is_last_plot else None,
labels=True) # Only show labels for the last plot
#for subset in full_ranges:
#subset_df = final_df[final_df['site'].isin(subset)]
#unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype'])
# The chart for the heatmap
base = (
alt.Chart(subset_df)
.encode(
x=alt.X('site:O', title='Site', sort=sites, axis=x_axis),
y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending'),axis=alt.Axis(grid=False)),
tooltip=['site','wildtype','mutant','value'],
)
.properties(
width=alt.Step(10),
height=alt.Step(11)
)
)
chart_empty = (
base.mark_rect(color='#d1d3d4').encode().transform_filter(
(alt.datum.effect == 'binding_median')
)
)
# Heatmap for effect
if not effect_legend_added:
chart_effect = (
base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=legend_title)),
alt.value('transparent')),
).transform_filter(
alt.datum.effect == 'binding_median'
)
)
effect_legend_added = True
else:
chart_effect = (
base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
alt.Color('value:Q', scale=color_scale_effect,legend=None),
alt.value('transparent')),
).transform_filter(
alt.datum.effect == 'binding_median'
)
)
chart_filtered = (
base.mark_rect(color='#939598',stroke='black',strokeWidth = strokewidth_size).encode(
).transform_filter(
alt.datum.effect == 'effect'
)
)
chart_contact = (
base.mark_rect(color='black').encode(
#color=alt.condition(f'datum.mutant != "entropy"',
#alt.Color('value:Q', scale=color_scale_contact,legend=None),
#alt.value('transparent')),
).transform_filter(
alt.datum.mutant == 'contact'
)
)
# The layer for the wildtype boxes
wildtype_layer_box = (
alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# The layer for the wildtype amino acids
wildtype_layer = (
alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8, align='center', baseline='middle').encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# Combine the heatmap layer with the wildtype layer
chart = alt.layer(chart_empty, chart_effect,chart_filtered,chart_contact, wildtype_layer_box,wildtype_layer).resolve_scale(color='independent',y='shared')
charts.append(chart)
combined_chart = alt.vconcat(*charts,spacing=3,title=f'{legend_title}').resolve_scale(y='independent', x='independent',color='shared')
return combined_chart#.display()
E2_binding_plot = plot_heatmap_full(empty_df_E2,'Ephrin-B2 Binding')
E2_binding_plot.display()
E2_binding_plot.save(E2_binding_heatmap)
E3_binding_plot = plot_heatmap_full(empty_df_E3,'Ephrin-B3 Binding')
E3_binding_plot.display()
E3_binding_plot.save(E3_binding_heatmap)
Now prep data for contact site heatmaps¶
In [18]:
entropy_df = pd.read_csv(entropy_file)
entropy_df = entropy_df[['site','henipavirus_entropy']]
entropy_df = entropy_df.dropna(subset=['site'])
entropy_df['site'] = entropy_df['site'].astype('Int64')
entropy_df = entropy_df.rename(columns={'henipavirus_entropy':'entropy'})
entropy_df = entropy_df[['site','entropy']].drop_duplicates()
entropy_df['mutant'] = 'entropy'
entropy_df['wildtype'] = ''
entropy_df['effect'] = 'effect'
entropy_df.rename(columns={'entropy': 'value'}, inplace=True)
#display(entropy_df.head(3))
def make_contact(list_,name_):
df = pd.DataFrame({
'site': list_,
name_: [0.0] * len(list_)
})
df = df[['site',name_]]
df['mutant'] = name_
df['wildtype'] = ''
df['effect'] = 'binding_median'
df.rename(columns={name_:'value'}, inplace=True)
return df
contact_df = make_contact(config['contact_sites'],'contact')
#hydrophobic_df = make_contact(hydrophobic,'hydrophobic')
#salt_bridge_df = make_contact(salt_bridges,'salt bridge')
#h_bond_df = make_contact(h_bond_total,'hydrogen bond')
#display(contact_df.head(3))
def make_empty_df_with_filtered(df,filtered_df):
sites = range(71, 603)
amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
# Create the combination of each site with each amino acid
data = [{'site': site, 'mutant': aa} for site in sites for aa in amino_acids]
# Create the DataFrame
empty_df = pd.DataFrame(data)
all_sites_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
df_melted = all_sites_df.melt(id_vars=['site', 'mutant', 'wildtype'],
value_vars=['binding_median'],
var_name='effect', value_name='value')
df_filtered = filtered_df.melt(id_vars=['site', 'mutant', 'wildtype'],
value_vars=['effect'],
var_name='effect', value_name='value')
df_test = pd.concat([df_melted,df_filtered],ignore_index=True) #contact_df
return df_test
empty_df_E2 = make_empty_df_with_filtered(df_E2_filter,df_E2_filter_missing)
empty_df_E3 = make_empty_df_with_filtered(df_E3_filter,df_E3_filter_missing)
In [ ]:
In [19]:
def plot_heatmap_contact(df,subset,name):
# Create y-axis order list
custom_order = ['R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C'] #custom_order = names + custom_order
final_df = df
full_ranges = [list(range(start, end)) for start, end in [(71, 204), (204, 337), (337, 470), (470, 603)]]
# Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
final_df = final_df.sort_values('site')
sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
# Now sort the dataframe by this rank
final_df = final_df.sort_values('mutant_rank')
# Drop the 'mutant_rank' column as it is no longer needed after sorting
final_df = final_df.drop(columns=['mutant_rank'])
sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
if name == 'Ephrin-B2 Binding':
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0,domain=[-6,2.5])
else:
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0,domain=[-2,1.5])
color_scale_entropy = alt.Scale(scheme='teals', domain=[0, 1],reverse=True)
color_scale_contact = alt.Scale(scheme='teals', domainMid=0, domain=[0, 5],reverse=True)
color_scale_features = alt.Scale(scheme='greys', domainMid=0, domain=[0, 5],reverse=True)
subset_df = final_df[final_df['site'].isin(subset)]
unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype'])
strokewidth_size = 0.25
charts = [] # container to hold the charts
# The chart for the heatmap
base = (
alt.Chart(subset_df)
.encode(
x=alt.X('site:O', title='Contact Site', sort=sites, axis=alt.Axis(labelAngle=-90,grid=False)),
y=alt.Y('mutant', title='Amino Acid', sort=alt.EncodingSortField(field='sort_order', order='ascending'),axis=alt.Axis(grid=False)),
tooltip=['site','wildtype','mutant','value'],
)
.properties(
width=alt.Step(10),
height=alt.Step(11)
)
)
chart_empty = (
base.mark_rect(color='#e6e7e8').encode().transform_filter(
(alt.datum.effect == 'binding_median')
)
)
# Heatmap for effect
chart_effect = (
base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title='binding')),
alt.value('transparent')),
).transform_filter(
alt.datum.effect == 'binding_median'
)
)
chart_filtered = (
base.mark_rect(color='#939598',stroke='black',strokeWidth = strokewidth_size).encode(
).transform_filter(
alt.datum.effect == 'effect'
)
)
chart_contact = (
base.mark_rect().encode(
color=alt.condition(f'datum.mutant != "entropy"',
alt.Color('value:Q', scale=color_scale_contact,legend=None),
alt.value('transparent')),
).transform_filter(
(alt.datum.mutant == 'hydrophobic') | (alt.datum.mutant == 'salt bridge') | (alt.datum.mutant == 'hydrogen bond')
)
)
# The layer for the wildtype boxes
wildtype_layer_box = (
alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# The layer for the wildtype amino acids
wildtype_layer = (
alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8, align='center', baseline='middle').encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# Combine the heatmap layer with the wildtype layer
chart = alt.layer(chart_empty, chart_effect,chart_filtered,wildtype_layer_box,wildtype_layer).resolve_scale(color='independent') #chart_contact,
charts.append(chart)
combined_chart = alt.vconcat(*charts,title=f'{name}').resolve_scale(y='independent', x='independent',color='shared')
return combined_chart.display()
plot_heatmap_contact(empty_df_E2,config['contact_sites'],'Ephrin-B2 Binding')
plot_heatmap_contact(empty_df_E3,config['contact_sites'],'Ephrin-B3 Binding')
Plot heatmap of sites with mutant effects different between amino acid types¶
In [20]:
def check_muts_by_properties(df,min_group_size,positive_value,negative_value):
letter_to_type = {'D': 'negative',
'E': 'negative',
'K': 'positive',
'R': 'positive',
'H': 'positive',
'C': 'special',
'S': 'hydrophilic',
'T': 'hydrophilic',
'N': 'hydrophilic',
'Q': 'hydrophilic',
'G': 'special',
'A': 'hydrophobic',
'V': 'hydrophobic',
'L': 'hydrophobic',
'I': 'hydrophobic',
'M': 'hydrophobic',
'P': 'special',
'F': 'aromatic',
'Y': 'armomatic',
'W': 'aromatic',
'*': 'stop'}
df['mutant_type'] = df['mutant'].map(letter_to_type)
#display(df)
#df = df[df['mutant_type'] != 'special']
#min_group_size = 3 # Set your minimum group size
grouped = df.groupby(['site', 'mutant_type'])
filtered_groups = grouped.filter(lambda x: (x['binding_median'] > positive_value).all() and len(x) >= min_group_size)
positive_mask = (
filtered_groups.groupby('site')['binding_median'].transform(lambda x: (x > positive_value).all()) &
df.groupby('site')['binding_median'].transform(lambda x: (x <= positive_value).any())
)
positive_sites = df[positive_mask]
filtered_groups_neg = grouped.filter(lambda x: (x['binding_median'] < negative_value).all() and len(x) >= min_group_size)
negative_mask = (
filtered_groups_neg.groupby('site')['binding_median'].transform(lambda x: (x < negative_value).all()) &
df.groupby('site')['binding_median'].transform(lambda x: (x <= negative_value).any())
)
negative_sites = df[negative_mask]
common_sites = pd.merge(positive_sites, negative_sites, on='site', how='inner')
final_sites = common_sites['site'].drop_duplicates()
return final_sites
final_sites_E2 = check_muts_by_properties(df_E2_filter,1,0,-0.5)
final_sites_E3 = check_muts_by_properties(df_E3_filter,1,0,-0.5)
print(list(final_sites_E2))
[153, 169, 202, 214, 215, 232, 233, 235, 241, 244, 245, 246, 247, 250, 268, 271, 284, 305, 309, 310, 319, 337, 347, 362, 378, 388, 394, 404, 406, 409, 440, 458, 463, 479, 482, 508, 511, 525, 529, 555, 577, 578, 591, 593]
In [21]:
def plot_heatmap_pref_switch(df,subset,name):
# Create y-axis order list
custom_order = ['contact','R', 'K', 'H', 'D', 'E', 'Q', 'N', 'S', 'T', 'Y', 'W', 'F', 'A', 'I', 'L', 'M', 'V', 'G', 'P', 'C'] #custom_order = names + custom_order
final_df = df
full_ranges = [list(range(start, end)) for start, end in [(71, 204), (204, 337), (337, 470), (470, 603)]]
# Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
final_df = final_df.sort_values('site')
sort_order = {mutant: i for i, mutant in enumerate(custom_order)}
final_df['mutant_rank'] = final_df['mutant'].map(sort_order) # Map the 'mutant' column to these ranks
# Now sort the dataframe by this rank
final_df = final_df.sort_values('mutant_rank')
# Drop the 'mutant_rank' column as it is no longer needed after sorting
final_df = final_df.drop(columns=['mutant_rank'])
sites = sorted(final_df['site'].unique(), key=lambda x: float(x))
if name == 'EFNB2 Binding':
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0) #EFNB3 is -1.5,2, EFNB2 is -6,3
else:
color_scale_effect = alt.Scale(scheme='redblue', domainMid=0) #EFNB3 is -1.5,2, EFNB2 is -6,3
color_scale_entropy = alt.Scale(scheme='teals', domain=[0, 1],reverse=True)
color_scale_contact = alt.Scale(scheme='teals', domainMid=0, domain=[0, 5],reverse=True)
color_scale_features = alt.Scale(scheme='greys', domainMid=0, domain=[0, 5],reverse=True)
subset_df = final_df[final_df['site'].isin(subset)]
unique_wildtypes_df = subset_df.drop_duplicates(subset=['site','wildtype'])
strokewidth_size = 0.25
charts = [] # container to hold the charts
# The chart for the heatmap
base = (
alt.Chart(subset_df)
.encode(
x=alt.X('site:O', title=None, sort=sites, axis=alt.Axis(labelAngle=-90,grid=False)),
y=alt.Y('mutant', title=None, sort=alt.EncodingSortField(field='sort_order', order='ascending'),axis=alt.Axis(grid=False)),
tooltip=['site','wildtype','mutant','value'],
)
.properties(
width=alt.Step(10),
height=alt.Step(11)
)
)
chart_empty = (
base.mark_rect(color='#e6e7e8').encode().transform_filter(
(alt.datum.effect == 'binding_median')
)
)
# Heatmap for effect
chart_effect = (
base.mark_rect(stroke='black',strokeWidth = strokewidth_size).encode(
color=alt.condition(f'datum.mutant != "entropy" && datum.mutant != "contact"',
alt.Color('value:Q', scale=color_scale_effect,legend=alt.Legend(title=name)),
alt.value('transparent')),
).transform_filter(
alt.datum.effect == 'binding_median'
)
)
chart_filtered = (
base.mark_rect(color='#939598',stroke='black',strokeWidth = strokewidth_size).encode(
).transform_filter(
alt.datum.effect == 'effect'
)
)
chart_contact = (
base.mark_rect().encode(
color=alt.condition(f'datum.mutant != "entropy"',
alt.Color('value:Q', scale=color_scale_contact,legend=None),
alt.value('transparent')),
).transform_filter(
(alt.datum.mutant == 'hydrophobic') | (alt.datum.mutant == 'salt bridge') | (alt.datum.mutant == 'hydrogen bond')
)
)
# The layer for the wildtype boxes
wildtype_layer_box = (
alt.Chart(unique_wildtypes_df).mark_rect(color='white',stroke='black',strokeWidth = strokewidth_size).encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# The layer for the wildtype amino acids
wildtype_layer = (
alt.Chart(unique_wildtypes_df).mark_text(color='black', text='X', size=8, align='center', baseline='middle').encode(
x=alt.X('site:O', sort=sites),
y=alt.Y('wildtype', sort=alt.EncodingSortField(field='sort_order', order='ascending')),
opacity=alt.value(1)
)
.transform_filter(
(alt.datum.wildtype != '') & (alt.datum.wildtype != None) & (alt.datum.value != None)
)
)
# Combine the heatmap layer with the wildtype layer
chart = alt.layer(chart_empty, chart_effect,chart_filtered,chart_contact, wildtype_layer_box,wildtype_layer).resolve_scale(color='independent')
charts.append(chart)
combined_chart = alt.vconcat(*charts).resolve_scale(y='independent', x='independent',color='shared')
return combined_chart.display()
cysteine_neck = [146, 158, 162]
cysteine_1 = [189,601]
cysteine_2 = [216, 240]
cysteine_3 = [282,295]
cysteine_4 = [382, 395]
cysteine_5 = [387, 499]
cysteine_6 = [493, 503]
cysteine_7 = [565, 574]
cysteine = cysteine_neck + cysteine_1 + cysteine_2 + cysteine_3 + cysteine_4 + cysteine_5 + cysteine_6 + cysteine_7
#print(cysteine)
n_linked = [159,306,378,417,481,529]
#n_linked_removal = [161,308,380,419,483,531]
stalk = list(range(96, 147))
neck = list(range(148,166))
linker = list(range(166,177))
#PNLG = [164, 180, 187, 191, 195, 275, 288, 311, 326, 359, 403, 423, 430, 473, 478, 543, 554, 570, 600]
total_list = [n_linked,stalk, neck, linker,cysteine]
for i in total_list:
plot_heatmap_pref_switch(empty_df_E2,i,'EFNB2 Binding')
plot_heatmap_pref_switch(empty_df_E3,i,'EFNB3 Binding')
Now plot heatmaps depending on wildtype amino acid property¶
In [22]:
hydrophobic_AA = ['A','V','L','I','M']
aromatic_AA = ['Y','W','F']
positive_AA = ['K','R','H']
negative_AA = ['E','D']
hydrophilic_AA = ['S','T','N','Q']
def find_aa_wildtype_sites(df,aa_type):
aa_list = list(df[df['wildtype'].isin(aa_type)]['site'].unique())
return aa_list
hydrophobic_AA_list = find_aa_wildtype_sites(df_E2_filter,hydrophobic_AA)
aromatic_AA_list = find_aa_wildtype_sites(df_E2_filter,aromatic_AA)
positive_AA_list = find_aa_wildtype_sites(df_E2_filter,positive_AA)
negative_AA_list = find_aa_wildtype_sites(df_E2_filter,negative_AA)
hydrophilic_AA_list = find_aa_wildtype_sites(df_E2_filter,hydrophilic_AA)
all_AA = [hydrophobic_AA_list,aromatic_AA_list,positive_AA_list,negative_AA_list,hydrophilic_AA_list]
for i in all_AA:
plot_heatmap_pref_switch(empty_df_E2,i,'EFNB2 Binding')
plot_heatmap_pref_switch(empty_df_E3,i,'EFNB3 Binding')
In [ ]: